List of libraries used
library(viridis)
## Loading required package: viridisLite
library(zipcode)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble 2.1.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ✔ purrr 0.3.3
## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(formattable)
##
## Attaching package: 'formattable'
## The following object is masked from 'package:plotly':
##
## style
library(tibble)
filepath <- paste(getwd(),"/kc_house_data.csv",sep='')
my_df <- read.csv(filepath,stringsAsFactors = FALSE)
#order zipcode col in ascending order
#using dplyr library
my_df <- arrange(my_df, zipcode)
Getting City and State names for respective zipcodes. For easy understanding, extracted City and State names based on the zipcodes given in the file. Later, used this data to show in the tabels.
#loading the data for zipcodes
#using library 'ziocode'
data(zipcode)
#extracting all the zicodes from the dataset
city_state_names_for_zipcodes <- data.frame(c(my_df$zipcode))
#mapping the zipcodes to their respective states and cities
city_state_names_for_zipcodes$zip = clean.zipcodes(city_state_names_for_zipcodes$c.my_df.zipcode.)
data(zipcode)
city_state_names_for_zipcodes=merge(city_state_names_for_zipcodes,zipcode, by.x = 'zip', by.y='zip')
#merging city and state columns from the new dataframe to my_data
my_data <- cbind(my_df,city_state_names_for_zipcodes$city,city_state_names_for_zipcodes$state)
#remaning columns
colnames(my_data)[22] <- "City"
colnames(my_data)[23] <- "State"
Extracting the year and month from the column “date” (was in the format “20141013T000000”) which indicates the year in which the house was sold. The following code is to get the month, date and year in which the house was sold.
#extracting the date from the column "date"
get_date <- as.Date(my_data$date,"%Y%m%dT000000")
#extracting just the year from the date
year_sold <- as.numeric(format(get_date,'%Y'))
#extracting the month in which the house was sold
month_sold <- as.numeric(format(get_date,'%m'))
#adding a column with just the year sold and month sold to the data frame "my_data"
my_data <- cbind(my_data,year_sold, month_sold)
Number of houses built in the years 2000 to 2015 in King County, USA
#used dplyr, ggolot2 and tidyr libraries
#subsetting data with year built >= 2000
newdf <- filter(my_data, yr_built >= "2000")
#counting the no. of houses built for each year
houses_built_per_year <- as.data.frame(table(newdf$yr_built))
# using ggplot 2 to plot the above data
ggplot(houses_built_per_year, aes(x = Var1, y = Freq)) +
geom_bar(stat = "identity",fill = "#FF6666")+
geom_text(aes(label=Freq), vjust=-0.9)+
ylim(c(0, 600))+
labs(title="No. of houses built in each year", y="Count", x="Years")+
theme(plot.title=element_text(size=16,
face="bold",
family="Times",
color="black",
hjust=0.5,
lineheight=1.2),
axis.title = element_text(colour = "black", size = 12))
From the above graph, we can see that the year 2014 has the most number of houses built
Average price of houses with 0-10 bedrooms over the years 2000 to 2015
#used potly library
#sorting the the column "bedrooms"
newdf <- newdf[order(newdf$bedrooms),]
#plotting using plotly
price <- newdf$price/100000
bedroom <- c(0:10)
plot_ly(newdf, x = ~sqft_living, y = ~price, type = 'scatter', mode = 'markers', color = ~bedrooms, text = ~bedrooms,
marker = list(opacity = 0.5),width = 900,height = 500)%>%
layout(title = 'Price distribution by sqft and no. of bedrooms',
xaxis = list(showgrid = FALSE, title = "Sqft"),
yaxis = list(showgrid = FALSE))%>%
add_trace(
type = 'scatter',
mode = 'markers',
hovertemplate = paste('Price: %{y:$.2f}','<br><b>Sqft</b>: %{x}<br>','Bedrooms: %{text}<extra></extra>'),
showlegend = FALSE
)
When you click on the graph and hover over each bubble, it gives you a snapshot of the number of bedrooms you get for that price and sqft
Which Zipcode has the most expensive houses. Most desirable zipcodes in King County
#used tidyverse
#getting the mean and median price of the house
avg_house_price <- as.data.frame(aggregate(price ~ zipcode, newdf, mean))
median_house_price <- as.data.frame(aggregate(price ~ zipcode, newdf, median))
avg_median_merge <- merge(avg_house_price,median_house_price,by="zipcode")
#using plotly to plot the mean and median house prices per zipcode
plot_ly(x = as.factor(avg_median_merge$zipcode),width = 900,height = 500) %>%
layout(title = "Average and Median house prices per zipcode",
xaxis = list(title = "Zipcode"),
yaxis = list (title = "Price"),
autosize = F) %>%
add_lines(y = avg_median_merge$price.x, color = I("red"), name = "Average price",hoverinfo = 'text',
text = ~paste('</br> Avg price: ', avg_median_merge$price.x,
'</br> Zipcode: ', avg_median_merge$zipcode)) %>%
add_lines(y = avg_median_merge$price.y, color = I("yellow"), name = "Median price",hoverinfo = 'text',
text = ~paste('</br> Median price: ', avg_median_merge$price.y,
'</br> Zipcode: ', avg_median_merge$zipcode))
When you click on the graph above and hover over any point on the line, it shows you the mean and median price of houses in that zip code. The red line shows the average price of the houses, and the yellow line shows the median price of houses. We can see that the price gradually increases for the as we move from zipcode 98039 to 98039. Compared to other zip codes, the mean and median price in 98039 is higher.
#used formattable, dplyr and tibble libraries
#subsetting data for year 2014
year_2014 <- filter(my_data, year_sold == "2014")
year_2015 <- filter(my_data, year_sold == "2015")
#creating a table containting number of houses sold in each month in the year 2014 per City
table_2014 <- as.data.frame.matrix(table(year_2014$City, year_2014$month_sold))
colnames(table_2014) <- c("May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
#naming the first col in the matrix
table_2014 <- table_2014 %>%
rownames_to_column(var = "City")
#styling the table
#creating a function to check the max value in each col
max.col.may = function(x)(max(table_2014$May)==x)
max.col.jun = function(y)(max(table_2014$Jun)==y)
max.col.jul = function(z)(max(table_2014$Jul)==z)
max.col.aug = function(a)(max(table_2014$Aug)==a)
max.col.sep = function(b)(max(table_2014$Sep)==b)
max.col.oct = function(c)(max(table_2014$Oct)==c)
max.col.nov = function(d)(max(table_2014$Nov)==d)
max.col.dec = function(e)(max(table_2014$Dec)==e)
#formatting the table to highlight the max number each column
df_2014 <- formattable(table_2014, list('May' = color_bar("#FA614B66", fun = max.col.may),
'Jun' = color_bar("#FA614B66", fun = max.col.jun),
'Jul' = color_bar("#FA614B66", fun = max.col.jul),
'Aug' = color_bar("#FA614B66", fun = max.col.aug),
'Sep' = color_bar("#FA614B66", fun = max.col.sep),
'Oct' = color_bar("#FA614B66", fun = max.col.oct),
'Nov' = color_bar("#FA614B66", fun = max.col.nov),
'Dec' = color_bar("#FA614B66", fun = max.col.dec)))
#creating a table containting number of houses sold in each month in the year 2014 per City
table_2015 <- as.data.frame.matrix(table(year_2015$City, year_2015$month_sold))
colnames(table_2015) <- c("Jan","Feb","Mar","Apr","May")
#naming the first col in the matrix
table_2015 <- table_2015 %>%
rownames_to_column(var = "City")
#styling the table
#creating a function to check the max value in each col
max.col.jan = function(h)(max(table_2015$Jan)==h)
max.col.feb = function(i)(max(table_2015$Feb)==i)
max.col.mar = function(j)(max(table_2015$Mar)==j)
max.col.apr = function(k)(max(table_2015$Apr)==k)
max.col.may2 = function(l)(max(table_2015$May)==l)
#formatting the table to highlight the max number each column
df_2015 <- formattable(table_2015, align = c("c","c","c","c","c","c","c"),
list('Jan' = color_bar("#FA614B66", fun = max.col.jan),
'Feb' = color_bar("#FA614B66", fun = max.col.feb),
'Mar' = color_bar("#FA614B66", fun = max.col.mar),
'Apr' = color_bar("#FA614B66", fun = max.col.apr),
'May' = color_bar("#FA614B66", fun = max.col.may2)))
df_2014
| City | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|
| Auburn | 70 | 94 | 101 | 65 | 68 | 66 | 62 | 77 |
| Bellevue | 113 | 142 | 142 | 149 | 119 | 126 | 89 | 89 |
| Black Diamond | 9 | 4 | 9 | 15 | 10 | 5 | 7 | 3 |
| Bothell | 8 | 18 | 22 | 24 | 13 | 16 | 21 | 20 |
| Carnation | 11 | 11 | 13 | 17 | 7 | 10 | 8 | 15 |
| Duvall | 14 | 25 | 18 | 25 | 14 | 10 | 6 | 12 |
| Enumclaw | 8 | 14 | 22 | 23 | 20 | 25 | 26 | 16 |
| Fall City | 5 | 6 | 8 | 5 | 5 | 12 | 7 | 2 |
| Federal Way | 58 | 62 | 79 | 65 | 60 | 69 | 49 | 77 |
| Issaquah | 72 | 86 | 97 | 71 | 54 | 63 | 41 | 30 |
| Kenmore | 34 | 26 | 25 | 26 | 25 | 19 | 14 | 16 |
| Kent | 81 | 114 | 109 | 99 | 99 | 109 | 73 | 81 |
| Kirkland | 74 | 87 | 120 | 92 | 77 | 87 | 60 | 78 |
| Maple Valley | 31 | 51 | 54 | 54 | 52 | 61 | 41 | 33 |
| Medina | 5 | 3 | 5 | 7 | 4 | 8 | 5 | 0 |
| Mercer Island | 28 | 44 | 26 | 27 | 24 | 13 | 14 | 20 |
| North Bend | 19 | 24 | 32 | 25 | 15 | 24 | 10 | 17 |
| Redmond | 93 | 100 | 128 | 85 | 87 | 84 | 61 | 59 |
| Renton | 123 | 160 | 167 | 127 | 121 | 132 | 111 | 104 |
| Sammamish | 70 | 85 | 92 | 84 | 73 | 50 | 52 | 55 |
| Seattle | 759 | 918 | 857 | 793 | 745 | 816 | 598 | 608 |
| Snoqualmie | 29 | 33 | 32 | 20 | 36 | 22 | 14 | 13 |
| Vashon | 7 | 20 | 15 | 8 | 11 | 10 | 13 | 6 |
| Woodinville | 47 | 53 | 38 | 34 | 35 | 41 | 29 | 40 |
df_2015
| City | Jan | Feb | Mar | Apr | May |
|---|---|---|---|---|---|
| Auburn | 40 | 54 | 96 | 92 | 27 |
| Bellevue | 62 | 87 | 116 | 138 | 35 |
| Black Diamond | 6 | 11 | 6 | 15 | 0 |
| Bothell | 6 | 10 | 14 | 16 | 7 |
| Carnation | 3 | 6 | 7 | 14 | 2 |
| Duvall | 7 | 14 | 17 | 17 | 11 |
| Enumclaw | 14 | 19 | 22 | 19 | 6 |
| Fall City | 7 | 0 | 10 | 11 | 3 |
| Federal Way | 41 | 69 | 65 | 67 | 18 |
| Issaquah | 27 | 42 | 57 | 74 | 19 |
| Kenmore | 17 | 14 | 24 | 38 | 5 |
| Kent | 64 | 81 | 124 | 124 | 45 |
| Kirkland | 46 | 52 | 95 | 83 | 26 |
| Maple Valley | 23 | 42 | 52 | 60 | 36 |
| Medina | 0 | 3 | 3 | 6 | 1 |
| Mercer Island | 7 | 10 | 30 | 34 | 5 |
| North Bend | 12 | 8 | 12 | 16 | 7 |
| Redmond | 42 | 45 | 79 | 92 | 24 |
| Renton | 78 | 99 | 142 | 182 | 51 |
| Sammamish | 35 | 36 | 60 | 91 | 17 |
| Seattle | 399 | 497 | 767 | 941 | 279 |
| Snoqualmie | 13 | 19 | 34 | 37 | 8 |
| Vashon | 3 | 4 | 7 | 9 | 5 |
| Woodinville | 26 | 28 | 36 | 55 | 9 |
From table 1, we can see that the higest number of houses sold in 2014 and 2015 were in Seattle
The table below shows information regarding: Price per squarefoot in each city in 2015, the number of houses sold, average price of the houses, no. of bedrooms and bathrooms.
# Calculating price per sqft for each house in in each city
#filtering data
seattle <- filter(my_data, City == "Seattle")
seattle_new <- as.data.frame.matrix(cbind(seattle$condition, seattle$price,seattle$sqft_living, seattle$bedrooms, seattle$bathrooms))
colnames(seattle_new) <- c("Condition","Price","Sqft","Bedrooms","Bathrooms")
#Condition 1 houses
cond1 <- filter(seattle_new, Condition == "1")
cond1_houses_sold <- nrow(cond1)
cond1_avg_price <- paste("$", round(sum(cond1$Price)/nrow(cond1)))
cond1_avg_sqft <- paste("$", round(sum(cond1$Sqft)/nrow(cond1)))
cond1_avg_price_per_sqft <- paste("$", round(sum(cond1$Price/cond1$Sqft)/cond1_houses_sold))
cond1_avg_bedrooms <- round(sum(cond1$Bedrooms)/nrow(cond1),1)
cond1_avg_bathrooms <- round(sum(cond1$Bathrooms)/nrow(cond1))
#combining columns for condition 1
cbind_1 <- cbind(1,cond1_houses_sold,cond1_avg_price,cond1_avg_sqft,cond1_avg_price_per_sqft,cond1_avg_bedrooms,cond1_avg_bathrooms)
#Condition 2 houses
cond2 <- filter(seattle_new, Condition == "2")
cond2_houses_sold <- nrow(cond2)
cond2_avg_price <- paste("$", round(sum(cond2$Price)/nrow(cond2)))
cond2_avg_sqft <- paste("$", round(sum(cond2$Sqft)/nrow(cond2)))
cond2_avg_price_per_sqft <- paste("$", round(sum(cond2$Price/cond1$Sqft)/cond2_houses_sold))
## Warning in cond2$Price/cond1$Sqft: longer object length is not a multiple
## of shorter object length
cond2_avg_bedrooms <- round(sum(cond2$Bedrooms)/nrow(cond2))
cond2_avg_bathrooms <- round(sum(cond2$Bathrooms)/nrow(cond2))
#combining columns for condition 1
cbind_2 <- cbind(2,cond2_houses_sold,cond2_avg_price,cond2_avg_sqft,cond2_avg_price_per_sqft,cond2_avg_bedrooms,cond2_avg_bathrooms)
#Condition 3 houses
cond3 <- filter(seattle_new, Condition == "3")
cond3_houses_sold <- nrow(cond3)
cond3_avg_price <- paste("$", round(sum(cond3$Price)/nrow(cond3)))
cond3_avg_sqft <- paste("$", round(sum(cond3$Sqft)/nrow(cond3)))
cond3_avg_price_per_sqft <- paste("$", round(sum(cond3$Price/cond1$Sqft)/cond3_houses_sold))
## Warning in cond3$Price/cond1$Sqft: longer object length is not a multiple
## of shorter object length
cond3_avg_bedrooms <- round(sum(cond3$Bedrooms)/nrow(cond3),1)
cond3_avg_bathrooms <- round(sum(cond3$Bathrooms)/nrow(cond3))
#combining columns for condition 3
cbind_3 <- cbind(3,cond3_houses_sold,cond3_avg_price,cond3_avg_sqft,cond3_avg_price_per_sqft,cond3_avg_bedrooms,cond3_avg_bathrooms)
#Condition 4 houses
cond4 <- filter(seattle_new, Condition == "4")
cond4_houses_sold <- nrow(cond4)
cond4_avg_price <- paste("$", round(sum(cond4$Price)/nrow(cond4)))
cond4_avg_sqft <- paste("$", round(sum(cond4$Sqft)/nrow(cond4)))
cond4_avg_price_per_sqft <- paste("$", round(sum(cond4$Price/cond1$Sqft)/cond4_houses_sold))
## Warning in cond4$Price/cond1$Sqft: longer object length is not a multiple
## of shorter object length
cond4_avg_bedrooms <- round(sum(cond4$Bedrooms)/nrow(cond4),1)
cond4_avg_bathrooms <- round(sum(cond4$Bathrooms)/nrow(cond4))
#combining columns for condition 4
cbind_4 <- cbind(4,cond4_houses_sold,cond4_avg_price,cond4_avg_sqft,cond4_avg_price_per_sqft,cond4_avg_bedrooms,cond4_avg_bathrooms)
#Condition 5 houses
cond5 <- filter(seattle_new, Condition == "5")
cond5_houses_sold <- nrow(cond5)
cond5_avg_price <- paste("$", round(sum(cond5$Price)/nrow(cond5)))
cond5_avg_sqft <- paste("$", round(sum(cond5$Sqft)/nrow(cond5)))
cond5_avg_price_per_sqft <- paste("$", round(sum(cond5$Price/cond1$Sqft)/cond5_houses_sold))
## Warning in cond5$Price/cond1$Sqft: longer object length is not a multiple
## of shorter object length
cond5_avg_bedrooms <- round(sum(cond5$Bedrooms)/nrow(cond5),1)
cond5_avg_bathrooms <- round(sum(cond5$Bathrooms)/nrow(cond5))
#combining columns for condition 5
cbind_5 <- cbind(5,cond5_houses_sold,cond5_avg_price,cond5_avg_sqft,cond5_avg_price_per_sqft,cond5_avg_bedrooms,cond5_avg_bathrooms)
# Using rbind to view all the conditions in a matrix form
combine_table <- as.data.frame.matrix(rbind(cbind_1,cbind_2,cbind_3,cbind_4,cbind_5))
# Re-naming the cols in the matrix
colnames(combine_table) <- c("Condition","Houses sold", "Avg Price", "Avg sqft", "Avg price/sqft", "Bedrooms","Bathrooms")
formattable(combine_table, align = c("c","c","c","c","c","c","c"))
| Condition | Houses sold | Avg Price | Avg sqft | Avg price/sqft | Bedrooms | Bathrooms |
|---|---|---|---|---|---|---|
| 1 | 19 | $ 290524 | $ 1206 | $ 266 | 2.5 | 1 |
| 2 | 94 | $ 321673 | $ 1183 | $ 304 | 3 | 1 |
| 3 | 5754 | $ 508620 | $ 1757 | $ 514 | 3.1 | 2 |
| 4 | 2152 | $ 555053 | $ 1813 | $ 557 | 3.2 | 2 |
| 5 | 958 | $ 673844 | $ 2054 | $ 674 | 3.4 | 2 |
From table 2, we can see that, houses in better condition are priced higher. Therefore, there is scope for an investor to buy the house in poor condition, upgrade and renovate it and sell it for a premium price.